Use output from Humann2 to assess the gene families, pathway abundance, and pathway coverage, assigned to Gardnerella vaginalis across samples.

Setup

Packages

library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1       ✔ purrr   0.3.2  
## ✔ tibble  2.1.1       ✔ dplyr   0.8.0.1
## ✔ tidyr   0.8.3       ✔ stringr 1.4.0  
## ✔ readr   1.3.1       ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(ggrepel)

File paths

gardRelativeAbundance <- read_delim("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/gard_mapping/clade_assignments/gardRelativeAbundance.tsv", delim = "\t") %>%
  mutate(SampleID=as.character(SampleID),
         Clade=as.character(Clade))
## Parsed with column specification:
## cols(
##   SampleID = col_double(),
##   Clade = col_double(),
##   n = col_double(),
##   propClade = col_double()
## )
sgDF <- read_delim("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/shotgunMetadata.tsv", delim = "\t")%>%
  mutate(SampleID=as.character(SampleID))
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   SampleIndex = col_character(),
##   SampleType = col_character(),
##   TermPre = col_character(),
##   Operator = col_character(),
##   PctTrim = col_character(),
##   RNAlater = col_character(),
##   Cohort = col_character()
## )
## See spec(...) for full column specifications.
load("/Users/hlberman/Documents/GitHub/pregnancy_metagenome/methods_comparisons/PS_RData/metaphlan_PS.RData")

# define file paths
dataDir <- "/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables"
GF_path <- file.path(dataDir, "VM_genefamilies_names_cpm.tsv")
GFP_path <- file.path(dataDir, "VM_genefamilies_paths_cpm.tsv")
GOnames_path <- file.path(dataDir, "map_go_name.txt")
GFGO_path <- file.path(dataDir, "map_go_uniref90.txt")
PWnames_path <- file.path(dataDir, "map_metacyc-pwy_name.txt") 
PC_path <- file.path(dataDir, "VM_pathcoverage.tsv")
PA_path <- file.path(dataDir, "VM_pathabundance_cpm.tsv")

# read in data frames
GF <- read_delim(GF_path, delim = "\t") # Gene family abundances
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `# Gene Family` = col_character()
## )
## See spec(...) for full column specifications.
GFP <- read_delim(GFP_path, delim = "\t") # pathways for each gene family
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `1000801248_Abundance` = col_double()
## )
## See spec(...) for full column specifications.
## Warning: 1662 parsing failures.
## row col    expected    actual                                                                                                                  file
##   1  -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
##   2  -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
##   3  -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
##   4  -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
##   5  -- 108 columns 2 columns '/Volumes/GoogleDrive/My Drive/Callahan Lab/vagMicrobiomeMetagenome/humann2/out_tables/VM_genefamilies_paths_cpm.tsv'
## ... ... ........... ......... .....................................................................................................................
## See problems(...) for more details.
PWnames <- read_delim(PWnames_path, delim = "\t", col_names = c("Pathway", "pathwayAnnotation"))
## Parsed with column specification:
## cols(
##   Pathway = col_character(),
##   pathwayAnnotation = col_character()
## )
PC <- read_delim(PC_path, delim = "\t") # pathway coverage 
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `# Pathway` = col_character()
## )
## See spec(...) for full column specifications.
PA <- read_delim(PA_path, delim = "\t") # pathway abundnance
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   `# Pathway` = col_character()
## )
## See spec(...) for full column specifications.
GOnames <- read_delim(GOnames_path, delim = "\t", col_names = c("GO", "Function")) # names and annotations of GO categories
## Parsed with column specification:
## cols(
##   GO = col_character(),
##   Function = col_character()
## )
GFGO <- read_lines(GFGO_path) %>% # read in the file as a list
  str_split("\t")

Get Gardnerella genes/pathways only

Re-format tables for dplyr

Paths for gene families

GFP0 <- GFP %>%
  select(`# Pathway`) %>% # keep only pathway and gene family names
  filter(str_detect(`# Pathway`, "Gardnerella")) %>% # keep only Gardnerella
  separate(`# Pathway`, c("Pathway", "GeneFamily"), sep ="\\|g__Gardnerella.s__Gardnerella_vaginalis\\|", fill = "right") %>% # separate 
  separate(GeneFamily, c("uniref", "unirefAnnotation"), ": ") %>%
  filter(uniref != "") # remove empty pathways
head(GFP0)
## # A tibble: 6 x 3
##   Pathway        uniref          unirefAnnotation                      
##   <chr>          <chr>           <chr>                                 
## 1 COA-PWY-1      UniRef90_I4LNX4 Dephospho-CoA kinase                  
## 2 COA-PWY-1      UniRef90_D2R9V1 Phosphopantetheine adenylyltransferase
## 3 COA-PWY-1      UniRef90_D2RAK6 Dephospho-CoA kinase                  
## 4 COA-PWY-1      UniRef90_D2RBT1 Type III pantothenate kinase          
## 5 NONOXIPENT-PWY UniRef90_E3D9X8 Transaldolase                         
## 6 NONOXIPENT-PWY UniRef90_D6SZM2 Transaldolase

GO categories for gene family

GOcategories <- map(GFGO, `[[`, 1) %>% # get the GO category labels
  unlist()
GFGO0 <- map2(GOcategories, GFGO, ~paste(.x, .y, sep=",")) %>% # paste a separable (important for later making a dataframe) GO category to each gene family in each category
  unlist() %>% # collapse list
  as_tibble() %>% # make dataframe
  separate(1, c("GO", "uniref"), ",") %>% #separate into columns
  filter(!str_detect(uniref, "GO:[0-9]*$")) # remove rows with GO category label in uniref column
## Warning: Calling `as_tibble()` on a vector is discouraged, because the behavior is likely to change in the future. Use `tibble::enframe(name = NULL)` instead.
## This warning is displayed once per session.
head(GFGO0)
## # A tibble: 6 x 2
##   GO         uniref             
##   <chr>      <chr>              
## 1 GO:0000001 UniRef90_A0A016PS51
## 2 GO:0000001 UniRef90_A1C8D3    
## 3 GO:0000001 UniRef90_A1CBR9    
## 4 GO:0000001 UniRef90_A1CE31    
## 5 GO:0000001 UniRef90_A1CNY1    
## 6 GO:0000001 UniRef90_A1CPB1

Gene Family

GF0 <- GF %>%
  filter(str_detect(`# Gene Family`, "Gardnerella")) %>%
  as.data.frame() 
rownames(GF0) <- GF0[,1]
GF1 <- GF0 %>%
  select(-`# Gene Family`)
GF2 <- GF1 %>%
  t() %>%
  as.data.frame(strinsAsFactors=FALSE) %>%
  mutate(SampleID=str_extract(colnames(GF1), "[0-9]{10}")) %>%
  gather("GeneFamily", "Abundance", contains("Gardnerella")) %>%
  mutate(GeneFamily=str_extract(GeneFamily, ".*(?=.g__Gardnerella)")) %>%
  separate(GeneFamily, c("uniref", "unirefAnnotation"), ": ", fill = "right") %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "GWdell", "GWcoll", "Cohort")], by="SampleID")
head(GF2)
##     SampleID           uniref unirefAnnotation Abundance SubjectID TermPre
## 1 1000801248 UniRef90_unknown             <NA>   293.681     10008       T
## 2 1000801318 UniRef90_unknown             <NA>   168.545     10008       T
## 3 1000801368 UniRef90_unknown             <NA>   262.059     10008       T
## 4 1001301158 UniRef90_unknown             <NA>  5946.180     10013       P
## 5 1001301248 UniRef90_unknown             <NA> 11876.100     10013       P
## 6 1001301318 UniRef90_unknown             <NA>   807.692     10013       P
##   GWdell GWcoll   Cohort
## 1     41     25 Stanford
## 2     41     32 Stanford
## 3     41     37 Stanford
## 4     35     16 Stanford
## 5     35     25 Stanford
## 6     35     31 Stanford

Pathway Abundance

PA0 <- PA %>%
  filter(str_detect(`# Pathway`, "Gardnerella")) %>%
  as.data.frame() 
rownames(PA0) <- PA0[,1]
PA1 <- PA0 %>%
  select(-`# Pathway`)
PA2 <- PA1 %>%
  t() %>%
  as.data.frame(strinsAsFactors=FALSE) %>%
  mutate(SampleID=str_extract(colnames(PA1), "[0-9]{10}")) %>%
  gather("Pathway", "Abundance", contains("Gardnerella")) %>%
  mutate(Pathway=str_extract(Pathway, ".*(?=.g__Gardnerella)")) %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "GWdell", "GWcoll", "Cohort")], by="SampleID")
head(PA2)
##     SampleID      Pathway Abundance SubjectID TermPre GWdell GWcoll
## 1 1000801248 UNINTEGRATED   5507.21     10008       T     41     25
## 2 1000801318 UNINTEGRATED   3295.92     10008       T     41     32
## 3 1000801368 UNINTEGRATED   4797.42     10008       T     41     37
## 4 1001301158 UNINTEGRATED  73881.90     10013       P     35     16
## 5 1001301248 UNINTEGRATED 174983.00     10013       P     35     25
## 6 1001301318 UNINTEGRATED  15372.60     10013       P     35     31
##     Cohort
## 1 Stanford
## 2 Stanford
## 3 Stanford
## 4 Stanford
## 5 Stanford
## 6 Stanford

Pathway coverage

PC0 <- PC %>%
  filter(str_detect(`# Pathway`, "Gardnerella")) %>%
  as.data.frame() 
rownames(PC0) <- PC0[,1]
PC1 <- PC0 %>%
  select(-`# Pathway`)
PC2 <- PC1 %>%
  t() %>%
  as.data.frame(strinsAsFactors=FALSE) %>%
  mutate(SampleID=str_extract(colnames(PC1), "[0-9]{10}")) %>%
  gather("Pathway", "Coverage", contains("Gardnerella")) %>%
  mutate(Pathway=str_extract(Pathway, ".*(?=.g__Gardnerella)")) %>%
  left_join(sgDF[,c("SampleID", "SubjectID", "TermPre", "GWdell", "GWcoll", "Cohort")], by="SampleID")
head(PC2)
##     SampleID      Pathway Coverage SubjectID TermPre GWdell GWcoll
## 1 1000801248 UNINTEGRATED        1     10008       T     41     25
## 2 1000801318 UNINTEGRATED        1     10008       T     41     32
## 3 1000801368 UNINTEGRATED        1     10008       T     41     37
## 4 1001301158 UNINTEGRATED        1     10013       P     35     16
## 5 1001301248 UNINTEGRATED        1     10013       P     35     25
## 6 1001301318 UNINTEGRATED        1     10013       P     35     31
##     Cohort
## 1 Stanford
## 2 Stanford
## 3 Stanford
## 4 Stanford
## 5 Stanford
## 6 Stanford

By Preterm Birth

Gene Families

Gene Familes present in both TB and PTB

geneFamilies <- unique(paste(GF2$uniref, GF2$unirefAnnotation, sep = ": "))
GFfoldChange <- GF2 %>%
  group_by(uniref, unirefAnnotation, TermPre, Cohort) %>%
  summarise(mean(Abundance)) %>%
  ungroup() %>%
  spread(TermPre, `mean(Abundance)`) %>%
  filter(Cohort=="UAB"|Cohort=="Stanford",
         `T`>0,
         P>0) %>%
  mutate(FoldChange=case_when(P>`T`~P/`T`,
                              `T`>P~-`T`/P)) #%>%
#  left_join(GFGO0, by="uniref") %>%  # for adding GO information to the dataframe. removed for now as multiple GO categories were assigned to many uniref 
#  left_join(GOnames, by= "GO")
  
labels1 <- GFfoldChange[order(GFfoldChange$FoldChange),] %>%
  select(uniref, unirefAnnotation, Cohort, FoldChange)
labels1 <- labels1[1:20,]
labels2 <- GFfoldChange[order(-GFfoldChange$FoldChange),] %>%
  select(uniref, unirefAnnotation, Cohort, FoldChange)
labels2 <- labels2[1:10,]
labels <- rbind(labels1, labels2) %>%
  filter(unirefAnnotation!="NO_NAME")

ggplot(GFfoldChange, aes(x=uniref, y=FoldChange, label = unirefAnnotation)) +
  geom_point() +
  theme(axis.text.x = element_blank()) +
  ggrepel::geom_text_repel(data=labels) +
  labs(y=element_blank()) +
  facet_wrap(~Cohort) +
  annotate("text", label = "greater in PTB", x=550, y = 500) +  
  annotate("text", label = "greater in TB",  x=500, y = -500)


## Gene Families with Pathway information

GFfoldChangePath <- GFfoldChange %>%
  left_join(GFP0, by=c("uniref", "unirefAnnotation")) %>%
  filter(!is.na(Pathway)) %>%
  left_join(PWnames, by="Pathway")

ggplot(GFfoldChangePath, aes(x=uniref, y=FoldChange, color = paste(Pathway, pathwayAnnotation))) +
   geom_point() +
   theme(axis.text.x = element_blank()) +
   labs(y=element_blank(), color="Pathway") +
   facet_wrap(~Cohort) + 
   annotate("text", label = "greater in PTB", x=7.5, y = 0.5, size = 2) +  
   annotate("text", label = "greater in TB",  x= 7, y = -0.5, size = 2 )


Of uniref 90 gene families had pathway annotations, greatest fold increase in PTB is Threonine synthase of the superpathway of L-threonine biosynthesis (5.4 fold) in the UAB cohort and Aspartate carbamoyltransferase of the UMP biosynthesis pathway (2.6 fold) in the Stanford cohort. greatest fold increase in TB, is Dephospho-CoA kinase in the coenzyme A biosynthesis II pathway in both cohorts, with a 19.6 fold increase in the Stanford cohort and 17.901915 fold increase in the UAB cohort. Based on these results, I may need to look at another pathway database (perhaps Kegg) or consider a custom set of references.


List of greatest fold change uniref gene families (pathways not annotated for greatest fold change uniref genes)

GFfoldChange %>%
  filter(FoldChange<=-2000|FoldChange>500) %>%
  print(n = nrow(.), width = getOption("tibble.width"), n_extra = NULL)
## # A tibble: 16 x 6
##    uniref    unirefAnnotation             Cohort       P       T FoldChange
##    <chr>     <chr>                        <chr>    <dbl>   <dbl>      <dbl>
##  1 UniRef90… NO_NAME                      Stanf… 0.00658 1.76e+1     -2671.
##  2 UniRef90… NO_NAME                      Stanf… 0.00860 1.79e+1     -2079.
##  3 UniRef90… NO_NAME                      Stanf… 0.00614 1.94e+1     -3159.
##  4 UniRef90… NO_NAME                      Stanf… 0.00485 1.94e+1     -4001.
##  5 UniRef90… NO_NAME                      Stanf… 0.00797 1.90e+1     -2382.
##  6 UniRef90… N-acetylmuramoyl-L-alanine … UAB    6.86    4.40e-3      1558.
##  7 UniRef90… 2-C-methyl-D-erythritol 4-p… UAB    2.92    5.08e-3       575.
##  8 UniRef90… NO_NAME                      UAB    2.79    2.99e-3       935.
##  9 UniRef90… NO_NAME                      UAB    3.73    5.06e-3       737.
## 10 UniRef90… Putative virion core protei… UAB    3.55    3.40e-3      1046.
## 11 UniRef90… NO_NAME                      UAB    3.06    4.94e-3       619.
## 12 UniRef90… NO_NAME                      UAB    2.87    3.43e-3       837.
## 13 UniRef90… Modification methyltransfer… UAB    0.00102 2.07e+0     -2036.
## 14 UniRef90… Putative surface-anchored p… UAB    2.15    4.03e-3       533.
## 15 UniRef90… NO_NAME                      Stanf… 0.00204 5.50e+0     -2695.
## 16 UniRef90… Repeat protein               Stanf… 1.08    1.09e-3       997.

Gene familes only present

alternatePresence <- GF2 %>%
  group_by(uniref, unirefAnnotation, TermPre, Cohort) %>%
  summarise(mean(Abundance)) %>%
  ungroup() %>%
  spread(TermPre, `mean(Abundance)`) %>%
  filter(Cohort=="UAB"|Cohort=="Stanford",
         `T`==0|P==0,
         !(`T`==0&P==0)) %>%
  mutate(`T`=-`T`) %>%
  gather("TermPre", "Abundance", c(`T`, P)) %>% #make abundance one column instead of separate T and P columsn
  filter(Abundance!=0) %>% # get rid of duplicate entries where abundance = 0  
  left_join(GFP0, by=c("uniref", "unirefAnnotation")) %>%
  left_join(PWnames, by="Pathway")

ggplot(alternatePresence, aes(x=reorder(uniref, Abundance), y=Abundance))+
  geom_col() + 
  theme(axis.text.x = element_blank()) + 
  labs(x="Uniref90", y="Average reads per million") +
  facet_wrap(~Cohort) +
  annotate("text", label = "greater in PTB", x=300, y = 25) +  
  annotate("text", label = "greater in TB",  x=300, y = -50)

# remove bulk of the figure with little variation
alternatePresence %>%
  filter(Abundance>=10|Abundance<=10) %>%
ggplot(aes(x=reorder(uniref, Abundance), y=Abundance)) +
  geom_col() + 
  theme(axis.text.x = element_blank()) + 
  labs(x="Uniref90", y="Average reads per million") +
  facet_wrap(~Cohort) + 
  annotate("text", label = "greater in PTB", x=300, y = 25) +  
  annotate("text", label = "greater in TB",  x=300, y = -50)


Save list of alternately present genes and show list of greatest differences

alternatePresence %>%
  filter(Abundance>=10|Abundance<=-20) %>%
  arrange(-Abundance) %>%
  print(n = nrow(.), width = getOption("tibble.width"), n_extra = NULL)
## # A tibble: 33 x 7
##    uniref unirefAnnotation Cohort TermPre Abundance Pathway
##    <chr>  <chr>            <chr>  <chr>       <dbl> <chr>  
##  1 UniRe… NO_NAME          UAB    P            53.7 <NA>   
##  2 UniRe… NO_NAME          Stanf… P            25.4 <NA>   
##  3 UniRe… NO_NAME          Stanf… P            17.0 <NA>   
##  4 UniRe… NO_NAME          UAB    P            16.1 <NA>   
##  5 UniRe… NO_NAME          Stanf… P            13.5 <NA>   
##  6 UniRe… NO_NAME          UAB    P            12.6 <NA>   
##  7 UniRe… NO_NAME          UAB    P            11.7 <NA>   
##  8 UniRe… NO_NAME          UAB    P            11.0 <NA>   
##  9 UniRe… NO_NAME          UAB    P            10.5 <NA>   
## 10 UniRe… NO_NAME          UAB    P            10.2 <NA>   
## 11 UniRe… NO_NAME          Stanf… T           -20.6 <NA>   
## 12 UniRe… NO_NAME          Stanf… T           -24.9 <NA>   
## 13 UniRe… NO_NAME          UAB    T           -25.4 <NA>   
## 14 UniRe… NO_NAME          UAB    T           -25.5 <NA>   
## 15 UniRe… NO_NAME          Stanf… T           -25.8 <NA>   
## 16 UniRe… NO_NAME          Stanf… T           -27.9 <NA>   
## 17 UniRe… Hypothetical me… UAB    T           -28.5 <NA>   
## 18 UniRe… NO_NAME          Stanf… T           -31.0 <NA>   
## 19 UniRe… Histidine kinase UAB    T           -33.5 <NA>   
## 20 UniRe… NO_NAME          Stanf… T           -34.2 <NA>   
## 21 UniRe… NO_NAME          UAB    T           -34.3 <NA>   
## 22 UniRe… NO_NAME          UAB    T           -34.6 <NA>   
## 23 UniRe… Response regula… UAB    T           -35.0 <NA>   
## 24 UniRe… Putative nucleo… UAB    T           -35.2 <NA>   
## 25 UniRe… NO_NAME          UAB    T           -39.9 <NA>   
## 26 UniRe… NO_NAME          Stanf… T           -42.4 <NA>   
## 27 UniRe… NO_NAME          UAB    T           -43.4 <NA>   
## 28 UniRe… NO_NAME          Stanf… T           -50.2 <NA>   
## 29 UniRe… NO_NAME          UAB    T           -90.1 <NA>   
## 30 UniRe… NO_NAME          UAB    T          -113.  <NA>   
## 31 UniRe… NO_NAME          Stanf… T          -116.  <NA>   
## 32 UniRe… NO_NAME          UAB    T          -122.  <NA>   
## 33 UniRe… NO_NAME          Stanf… T          -225.  <NA>   
## # … with 1 more variable: pathwayAnnotation <chr>


Need to continure to assess gene function and pathway, and also look at distribution across samples of each of these genes.

Pathway Abundance

Assess reads per million in each annotated pathway (MetaCyc) from Gardnerella vaginalis sample across in term versus preterm birth. Cumulative reads in these pathways attributed to Gardnerella vaginalis determined by Humann2.

pathways <- unique(PA2$Pathway)
pathways
##  [1] "UNINTEGRATED"                                                                                
##  [2] "COA-PWY-1: coenzyme A biosynthesis II (mammalian)"                                           
##  [3] "NONOXIPENT-PWY: pentose phosphate pathway (non-oxidative branch)"                            
##  [4] "PENTOSE-P-PWY: pentose phosphate pathway"                                                    
##  [5] "PEPTIDOGLYCANSYN-PWY: peptidoglycan biosynthesis I (meso-diaminopimelate containing)"        
##  [6] "PWY-5100: pyruvate fermentation to acetate and lactate II"                                   
##  [7] "PWY-5686: UMP biosynthesis"                                                                  
##  [8] "PWY-6122: 5-aminoimidazole ribonucleotide biosynthesis II"                                   
##  [9] "PWY-6151: S-adenosyl-L-methionine cycle I"                                                   
## [10] "PWY-6277: superpathway of 5-aminoimidazole ribonucleotide biosynthesis"                      
## [11] "PWY-6385: peptidoglycan biosynthesis III (mycobacteria)"                                     
## [12] "PWY-6386: UDP-N-acetylmuramoyl-pentapeptide biosynthesis II (lysine-containing)"             
## [13] "PWY-6387: UDP-N-acetylmuramoyl-pentapeptide biosynthesis I (meso-diaminopimelate containing)"
## [14] "PWY-7219: adenosine ribonucleotides de novo biosynthesis"                                    
## [15] "PWY-7221: guanosine ribonucleotides de novo biosynthesis"                                    
## [16] "THRESYN-PWY: superpathway of L-threonine biosynthesis"
map(pathways, ~filter(PA2, Pathway==.x) %>% 
  ggplot(., aes(x=TermPre, y=Abundance, color=TermPre)) +
  geom_jitter() +
  geom_boxplot(alpha=0) +
  scale_y_log10() +
  labs(title=.x, x=element_blank(), y="Abundance (reads per million") + 
  scale_x_discrete(labels = c("Preterm", "Term")) + 
  scale_color_manual(values=c("#56B4E9", "#D55E00")) +  
  theme(legend.position = "none") +
  facet_wrap(~Cohort))
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]


Need to find cohort information of samples of unkown cohort. Some differences apparent between Stanford and UAB cohorts. of note is that no PTB samples have reads from the peptidoglycan biosynthesis III pathway in the Stanford cohort and only 3 Stanford term samples had reads from this pathway.

Pathway Coverage

PC3 <- PC2 %>%
  filter(Pathway!="UNINTEGRATED") # covereage value of "unintegrated" is meaningless
map(pathways[2:length(pathways)], ~filter(PC3, Pathway==.x) %>% # remove unintegrated from pathways list.
  ggplot(., aes(x=TermPre, y=Coverage, color=TermPre)) +
  geom_jitter() +
  geom_boxplot(alpha=0) +
  labs(title=.x, x=element_blank()) + 
  scale_x_discrete(labels = c("Preterm", "Term")) +
  ylim(0,1) +  
  scale_color_manual(values=c("#56B4E9", "#D55E00")) +  
  theme(legend.position = "none") +
  facet_wrap(~Cohort))
## [[1]]
## Warning: Removed 23 rows containing missing values (geom_point).

## 
## [[2]]
## Warning: Removed 47 rows containing missing values (geom_point).

## 
## [[3]]
## Warning: Removed 43 rows containing missing values (geom_point).

## 
## [[4]]
## Warning: Removed 27 rows containing missing values (geom_point).

## 
## [[5]]
## Warning: Removed 20 rows containing missing values (geom_point).

## 
## [[6]]
## Warning: Removed 12 rows containing missing values (geom_point).

## 
## [[7]]
## Warning: Removed 9 rows containing missing values (geom_point).

## 
## [[8]]
## Warning: Removed 15 rows containing missing values (geom_point).

## 
## [[9]]
## Warning: Removed 17 rows containing missing values (geom_point).

## 
## [[10]]
## Warning: Removed 53 rows containing missing values (geom_point).

## 
## [[11]]
## Warning: Removed 24 rows containing missing values (geom_point).

## 
## [[12]]
## Warning: Removed 19 rows containing missing values (geom_point).

## 
## [[13]]
## Warning: Removed 17 rows containing missing values (geom_point).

## 
## [[14]]
## Warning: Removed 17 rows containing missing values (geom_point).

## 
## [[15]]
## Warning: Removed 23 rows containing missing values (geom_point).


Coverage values vary from 0 to 1 and are a measure of the certainty that the pathway is present in the sample. Note that all coverage values for PWY-6385: peptidoglycan biosynthesis III (mycobacteria) were 0, jittering is causing it to appear that some values were greater than 0.

Continuing Goals

-Try assessing pathway information with KEGG pathways, since most gene families are unable to be mapped to a metacyc pathway (with provided Humann2 database), or consider a curated database. -Asses disrtibution across samples of certain gene familes. -Compare against strains that are present in each sample!!!

Session Info

sessionInfo()
## R version 3.5.2 (2018-12-20)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggrepel_0.8.1   forcats_0.4.0   stringr_1.4.0   dplyr_0.8.0.1  
##  [5] purrr_0.3.2     readr_1.3.1     tidyr_0.8.3     tibble_2.1.1   
##  [9] ggplot2_3.1.1   tidyverse_1.2.1
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5 xfun_0.6         haven_2.1.0      lattice_0.20-38 
##  [5] colorspace_1.4-1 generics_0.0.2   htmltools_0.3.6  yaml_2.2.0      
##  [9] utf8_1.1.4       rlang_0.3.4      pillar_1.3.1     glue_1.3.1      
## [13] withr_2.1.2      modelr_0.1.4     readxl_1.3.1     plyr_1.8.4      
## [17] munsell_0.5.0    gtable_0.3.0     cellranger_1.1.0 rvest_0.3.3     
## [21] evaluate_0.13    labeling_0.3     knitr_1.22       fansi_0.4.0     
## [25] broom_0.5.2      Rcpp_1.0.1       scales_1.0.0     backports_1.1.4 
## [29] jsonlite_1.6     hms_0.4.2        digest_0.6.18    stringi_1.4.3   
## [33] grid_3.5.2       cli_1.1.0        tools_3.5.2      magrittr_1.5    
## [37] lazyeval_0.2.2   crayon_1.3.4     pkgconfig_2.0.2  xml2_1.2.0      
## [41] lubridate_1.7.4  assertthat_0.2.1 rmarkdown_1.12   httr_1.4.0      
## [45] rstudioapi_0.10  R6_2.4.0         nlme_3.1-139     compiler_3.5.2